###########################
# SM - Obs and questions needed
###########################


# load packages
library(randomForest)
library(dplyr)
library(haven)
library(ggplot2)

# load data
# NOTE(review): cces12 and policy_chars are used below but never created in
# this script, so they presumably come from these .RData files — confirm.
load('1_output.RData')
load('3_output.RData')


###

# 1. Identify questions to ask (survey should ask more salient questions)

# a. Ideological placement: keep ideo5 codes below 6, set everything else to NA
cces12$ideo_placement <- with(cces12, ifelse(ideo5 < 6, ideo5, NA))

# b. One formula per outcome, each using every policy item as a predictor
temp <- paste(policy_chars, collapse = " + ")
formula_cf_issues <- as.formula(paste("true_CFscore ~", temp))
formula_placement_issues <- as.formula(paste("ideo_placement ~", temp))

# c. Random forest of placement on the policy items, fit on donors only
#    (rows with missing values are dropped by na.action)
set.seed(1776)
rf_donorplace_importance <- randomForest(
  formula_placement_issues,
  data = cces12[cces12$donor == 1, ],
  ntree = 1000,
  nodesize = 10,
  mtry = 2,
  na.action = na.omit
)

# d. Order the policy items from highest to lowest IncNodePurity
var_importance <- as.data.frame(rf_donorplace_importance$importance)
var_names <- rownames(var_importance)
rank <- order(-var_importance[["IncNodePurity"]])

vars_ranked <- var_names[rank]
vars_ranked

###

# 2. Prep bootstrap analysis

# Donor-sample sizes and question counts to evaluate
donors <- c(100, 250, 500, 1000)
questions <- c(3, 5, 8, 12)

# One row per (donors, questions) combination; donors varies fastest
specs <- setNames(expand.grid(donors, questions), c('donors', 'questions'))

# Placeholder columns for the average error of each predictor
specs[c('ideo_error', 'pca_error', 'cf_error')] <- NA

# Per-replicate error holders, one slot for each of 500 replicates
cf_error <- numeric(500)
pca_error <- numeric(500)
ideo5_error <- numeric(500)

# remove obs with missing data: keep only rows whose bootstrap flag is TRUE.
# which() discards rows where the flag is NA; a bare logical index would
# instead return an all-NA row for each of them, putting missing data back
# into the population this step is meant to clean.
# NOTE(review): the flag is presumably built upstream to mark complete
# cases — confirm.
cces_bootstrap <- cces12[which(cces12$bootstrap == TRUE), ]

###

# 3. Run bootstrap
#
# For every row of specs: fit a random forest of true_CFscore on the
# top-ranked questions using a random sample of donors, impute a CFscore for
# every row of cces_bootstrap, then compare how well ideo5, pca1 and the
# imputed CFscore each classify house_vote_gop over repeated resamples.

# In-sample misclassification rate of a one-predictor logistic regression.
#   predictor: name of the column of `data` used as the single regressor
#   data:      data frame holding house_vote_gop and the predictor
# Returns 1 minus the share of rows whose 0/1 prediction (fitted probability
# above 0.5) equals house_vote_gop.
vote_error_rate <- function(predictor, data) {
  fit <- glm(reformulate(predictor, response = "house_vote_gop"),
             data = data, family = "binomial")
  predicted <- ifelse(predict(fit, data, type = "response") > .5, 1, 0)
  1 - (sum(predicted == data$house_vote_gop) / nrow(data))
}

n_reps <- 500         # resampling replicates per spec
resample_size <- 500  # rows drawn from cces_bootstrap per replicate

# Row positions of donors. which() drops rows where donor is NA; indexing with
# the bare comparison would create all-NA rows that sample_n() could draw and
# na.omit would then discard, shrinking the training sample below the
# requested size.
donor_rows <- which(cces12$donor == 1)

# NOTE(review): the seeds set inside the loop reset the RNG on every pass, so
# this seed only governs the first donor draw; later draws continue from the
# stream left by the previous pass. Results remain reproducible.
set.seed(3000)
for (i in seq_len(nrow(specs))) {

  # a. collect questions and sample
  top_questions <- vars_ranked[seq_len(specs$questions[i])]
  rf_formula <- as.formula(
    paste("true_CFscore", paste(top_questions, collapse = " + "), sep = " ~ ")
  )

  donor_sample <- sample_n(cces12[donor_rows, ], size = specs$donors[i])

  # b. run new forest / impute new scores
  set.seed(1865)
  rf <- randomForest(rf_formula,
                     data = donor_sample, ntree = 1000, nodesize = 10, mtry = 2,
                     na.action = na.omit, keep.inbag = TRUE)

  cces_bootstrap$imputed_CFscore <- predict(rf, newdata = cces_bootstrap)

  # c. run bootstrap
  # The seed is reset for every spec, so all specs are scored on the same
  # sequence of resamples; since ideo5 and pca1 do not change between specs,
  # their error columns come out identical across rows of specs.
  # NOTE(review): sample() draws without replacement here, so each replicate
  # is a subsample rather than a classical bootstrap resample — confirm this
  # is intended.
  ideo5_error <- numeric(n_reps)
  pca_error <- numeric(n_reps)
  cf_error <- numeric(n_reps)

  set.seed(2023)
  for (j in seq_len(n_reps)) {
    boot_sample <- cces_bootstrap[
      sample(seq_len(nrow(cces_bootstrap)), resample_size),
    ]

    ideo5_error[j] <- vote_error_rate("ideo5", boot_sample)
    pca_error[j] <- vote_error_rate("pca1", boot_sample)
    cf_error[j] <- vote_error_rate("imputed_CFscore", boot_sample)
  }

  # d. collect results
  specs$cf_error[i] <- mean(cf_error)
  specs$ideo_error[i] <- mean(ideo5_error)
  specs$pca_error[i] <- mean(pca_error)
}

# Error of the imputed CFscore relative to the PCA score
specs$cf_compare <- specs$cf_error / specs$pca_error

###

# 4. Save results

# Write the per-spec error table to CSV in the working directory
write.csv(x = specs, file = 'sm_results.csv')

